import requests
from time import sleep
import pandas as pd
from time import sleep
from lxml import etree
from bs4 import BeautifulSoup


# driver = webdriver.Chrome(ChromeDriverManager().install())
# driver.get('https://collections-zoology.fieldmuseum.org/list?search_fulltext=%22Gorilla+gorilla%22&ss_DarOrder=&ss_DarFamily=&ss_DarGenus=&ss_DarSpecies=&ss_DarCollector=&ss_DarCatalogNumber=&ss_ColCollectionEventRefColSiteRef_PolPD1=&ss_ColCollectionEventRefColSiteRef_ClaWaterbodyOcean=&solr_document_69=All&ss_DarScientificName=&ss_CatCatalogSubset=&sm_PrvPreservation=&ss_DarPhylum=&ss_DarClass=&ss_DarTypeStatus=&ss_ColCollectionEventRefColSiteRef_PolPD2=&ss_ColCollectionEventRefColSiteRef_PolPD3=&ss_DarFieldNumber=&ss_ColCollectionEventRef_ColDateVisitedFrom=&ss_DarSubspecies=&ss_DarPreparations=&ss_DarIsland=&ss_DarSex=&ss_DarPreparationType=&irn=&ss_DarGlobalUniqueIdentifier=&sort_by=solr_document_21&sort_order=ASC&items_per_page=100&ss_CatTypesPresent=')
# Scrape the Field Museum zoology collection search results for
# "Gorilla gorilla": read the summary fields of every record on the listing
# page, follow each record's IRN link to its detail page, read the detail
# fields, and write everything to a CSV file.

BASE_URL = "https://collections-zoology.fieldmuseum.org"
LIST_URL = "https://collections-zoology.fieldmuseum.org/list?search_fulltext=%22Gorilla+gorilla%22&ss_DarOrder=&ss_DarFamily=&ss_DarGenus=&ss_DarSpecies=&ss_DarCollector=&ss_DarCatalogNumber=&ss_ColCollectionEventRefColSiteRef_PolPD1=&ss_ColCollectionEventRefColSiteRef_ClaWaterbodyOcean=&solr_document_69=All&ss_DarScientificName=&ss_CatCatalogSubset=&sm_PrvPreservation=&ss_DarPhylum=&ss_DarClass=&ss_DarTypeStatus=&ss_ColCollectionEventRefColSiteRef_PolPD2=&ss_ColCollectionEventRefColSiteRef_PolPD3=&ss_DarFieldNumber=&ss_ColCollectionEventRef_ColDateVisitedFrom=&ss_DarSubspecies=&ss_DarPreparations=&ss_DarIsland=&ss_DarSex=&ss_DarPreparationType=&irn=&ss_DarGlobalUniqueIdentifier=&sort_by=solr_document_21&sort_order=ASC&items_per_page=25&ss_CatTypesPresent="
OUTPUT_CSV = 'fieldmuseum_gorilla gorilla gorilla data extraction.csv'

# Value stored for any field whose label is not present on the page.
MISSING = 'No Entry Available.'

REQUEST_TIMEOUT = 30      # seconds; without a timeout a stalled connection hangs forever
MAX_ATTEMPTS = 5          # give up on a page after this many failed downloads
RETRY_DELAY = 10          # seconds to wait between failed attempts
DETAIL_PAUSE = 4          # seconds to wait before each detail-page request

# Browser-like request headers, shared by every request.
# NOTE: no 'If-None-Match' header is sent. A hard-coded ETag turns the request
# into a conditional GET; a matching server replies "304 Not Modified" with an
# empty body, which can never satisfy the "status == 200" check below.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Chromium";v="110", "Not A(Brand";v="24", "Google Chrome";v="110"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Labels read from each row of the listing page, in output-column order.
LISTING_LABELS = [
    'Catalog Subset: ',
    'Higher Classification: ',
    'Catalog number: ',
    'Taxonomic Name: ',
    'DwC Locality: ',
    'Collector/field: ',
    'Collection No.: ',
    'Coordinates Available?: ',
    'Tissue Available?: ',
    'Sex: ',
]

# Labels read from each record's detail page, in output-column order.
DETAIL_LABELS = [
    'FM Catalog: ',
    'Catalog Subset: ',
    'Scientific Name: ',
    'Phylum: ',
    'Class: ',
    'Order: ',
    'Family: ',
    'Genus: ',
    'Species: ',
    'Field #: ',
    'Collector: ',
    'Collection No.: ',
    'Geography: ',
    'Date Collected: ',
    'Preparations: ',
    'Tissue Available?: ',
    'Co-ordinates Available?: ',
    'Sex: ',
]

# CSV header: the listing-page fields followed by the detail-page fields.
# Several names repeat because the same field is read from both pages.
CSV_COLUMNS = ['IRN', 'Catalog Subset', 'Higher Classification', 'Catalog Number', 'Taxonomic Name', 'DwC Locality', 'Collector/Field', 'Collection No.', 'Coordinates Available?',
               'Tissue Available?', 'Sex', 'FM Catalog', 'Catalog Subset', 'Scientific Name',
               'Phylum', 'Class', 'Order', 'Family', 'Genus',
               'Species', 'Field #', 'Collector', 'Collection No.', 'Geography', 'Date Collected', 'Preparations', 'Tissue Available?',
               'Coordinates Available?', 'Sex']


def _fetch_html(page_url):
    """Download page_url and return its HTML text.

    Makes up to MAX_ATTEMPTS requests, waiting RETRY_DELAY seconds after each
    failure (a network error or any status other than 200).  Returns None
    when every attempt fails, so the caller can decide how to continue.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            response = requests.get(page_url, headers=HEADERS, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            print(page_url, exc)
        else:
            print(page_url, response)
            if response.status_code == 200:
                return response.text
        if attempt < MAX_ATTEMPTS:
            print('sleep for', RETRY_DELAY, 'seconds')
            sleep(RETRY_DELAY)
    print('giving up on', page_url, 'after', MAX_ATTEMPTS, 'attempts')
    return None


def _parse_html(html_text):
    """Parse html_text with lxml; return the root element, or None if unusable."""
    if not html_text or not html_text.strip():
        return None
    try:
        return etree.HTML(html_text)
    except etree.LxmlError as exc:
        print('could not parse page:', exc)
        return None


def _element_text(element):
    """Return the text content of an lxml element.

    The element is serialised (etree.tostring also emits the element's tail
    text) and re-read through BeautifulSoup/html5lib, whose ``.text`` yields
    the concatenated text of the fragment.
    """
    return BeautifulSoup(etree.tostring(element), 'html5lib').text


def _field_text(node, label):
    """Return the text of the first sibling element following the label span.

    Searches below node for a <span> whose text equals label exactly.
    Returns MISSING when node is None or no such span/sibling exists.
    """
    if node is None:
        return MISSING
    # The label is passed as an XPath variable, so it needs no quoting.
    siblings = node.xpath('.//span[text()=$label]/following-sibling::*', label=label)
    if not siblings:
        return MISSING
    return _element_text(siblings[0])


# --- Step 1: listing page ----------------------------------------------------
listing_html = _fetch_html(LIST_URL)
if listing_html is not None:
    print(listing_html)
listing_tree = _parse_html(listing_html)

if listing_tree is None:
    records_divs = []
else:
    records_divs = listing_tree.xpath('//div[contains(@class,"views-row views-row")]')
print(len(records_divs))

# One entry per listing row: [irn_link, irn, <LISTING_LABELS values...>]
basic_data = []
for div in records_divs:
    # The detail page is reached through the link next to the "IRN: " label;
    # a row without that link cannot be followed, so it is skipped.
    irn_anchors = div.xpath('.//span[text()=$label]/following-sibling::*/a', label='IRN: ')
    irn_link = irn_anchors[0].get('href') if irn_anchors else None
    if not irn_link:
        print('skipping listing row without an IRN link')
        continue

    irn = _field_text(div, 'IRN: ')
    print(irn)

    row = [irn_link, irn] + [_field_text(div, label) for label in LISTING_LABELS]
    basic_data.append(row)
    print(row)

# --- Step 2: detail pages ----------------------------------------------------
main_data = []

for record in basic_data:
    sleep(DETAIL_PAUSE)

    # If the page cannot be downloaded the tree is None and every detail
    # field falls back to MISSING; the listing-page values are still kept.
    detail_tree = _parse_html(_fetch_html(BASE_URL + record[0]))
    detail_values = [_field_text(detail_tree, label) for label in DETAIL_LABELS]

    # record[0] is the link itself and is not written to the CSV.
    main_data.append(list(record[1:]) + detail_values)

    # The CSV is rewritten after every record, so an interrupted run still
    # leaves the rows scraped so far on disk.
    df = pd.DataFrame(main_data, columns=CSV_COLUMNS)
    df.to_csv(OUTPUT_CSV, index=False, encoding='utf-8')
